Boston AirBNB

EDA Analysis

  • Load the dataset listings.csv which includes information about AirBNB listings in Boston.
library(tidyverse)
# Read the Airbnb listings table from the CSV file in the working directory.
listings <- read_csv(file = "listings.csv")
# Mapbox access token for the plot_mapbox() maps further down.
# NOTE(security): this token is committed in plain text. Prefer supplying
# MAPBOX_TOKEN from outside the script (e.g. ~/.Renviron) and rotating this
# one. A token that is already set in the environment is left untouched.
if (!nzchar(Sys.getenv("MAPBOX_TOKEN"))) {
  Sys.setenv("MAPBOX_TOKEN" = "pk.eyJ1IjoicmljZW1hY2hpbmUiLCJhIjoiY2tuZXJ1Z2x0MDEweDJvcGF4OGZtcDR4ZiJ9.-vyiET9gaDK4jXrzueRdZw")
}
  • EDA Analysis
library(plotly)
# Start from a copy of the raw listings and print it for a first look.
boston_airbnb <- listings
boston_airbnb
## # A tibble: 2,959 x 16
##       id name          host_id host_name neighbourhood_g~ neighbourhood latitude
##    <dbl> <chr>           <dbl> <chr>     <lgl>            <chr>            <dbl>
##  1  3781 HARBORSIDE-W~    4804 Frank     NA               East Boston       42.4
##  2  6695 $99 Special!~    8229 Terry     NA               Roxbury           42.3
##  3 10813 Back Bay Apt~   38997 Michelle  NA               Back Bay          42.4
##  4 10986 North End (W~   38997 Michelle  NA               North End         42.4
##  5 13247 Back Bay stu~   51637 Susan     NA               Back Bay          42.4
##  6 16384 Small Room i~   23078 Eric      NA               Beacon Hill       42.4
##  7 18711 The Dorset R~   71783 Lance     NA               Dorchester        42.3
##  8 22195 Copley House~   85130 Copley    NA               Back Bay          42.3
##  9 22354 COPLEY SQ...~   85770 Robert    NA               South End         42.3
## 10 40601 Private room~  174986 Robert    NA               Jamaica Plain     42.3
## # ... with 2,949 more rows, and 9 more variables: longitude <dbl>,
## #   room_type <chr>, price <dbl>, minimum_nights <dbl>,
## #   number_of_reviews <dbl>, last_review <date>, reviews_per_month <dbl>,
## #   calculated_host_listings_count <dbl>, availability_365 <dbl>
# Min / quartiles / mean / max of the listing price.
summary(listings[["price"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    69.0   110.0   148.6   174.0 10000.0
# The 20 lowest prices, in increasing order.
head(sort(listings$price, na.last = TRUE), n = 20)
##  [1]  0  0  0  0 19 20 21 21 23 25 25 25 25 25 26 26 26 27 27 28
# The 20 highest prices, in increasing order.
tail(sort(listings$price, na.last = TRUE), n = 20)
##  [1]   850   890   900   948   950   975   999  1000  1000  1000  1000  1000
## [13]  1052  1200  1700  1995  2116  3999 10000 10000
# Interactive box plot of the raw prices.
plot_ly(data = listings, type = "box", y = ~price)
# Count the units priced above 1000 (there are 8):
sum(listings[["price"]] > 1000)
## [1] 8
# Show name, room type and price of every unit priced above 1000.
listings %>%
  select(name, room_type, price) %>%
  filter(price > 1000)
## # A tibble: 8 x 3
##   name                                             room_type       price
##   <chr>                                            <chr>           <dbl>
## 1 Large, modern 2br, 2ba renovated 6/16 with patio Entire home/apt  3999
## 2 The Historic House in the North End/Waterfront   Entire home/apt  1052
## 3 Quiet getaway in a Boston home                   Entire home/apt  1200
## 4 Fun Retreat 13BR 23beds near Downtown FreePrk    Entire home/apt  2116
## 5 Boston Homestel, Double Bed                      Private room    10000
## 6 Boston Homestel, 1 Double and 1 Single Bed       Private room    10000
## 7 Renovated Studio Apartment in South Boston!!!    Entire home/apt  1995
## 8 Bright Bedroom in Beautiful Apartment            Private room     1700
# The two 10000 prices look like plain errors: 10000 for a private room with a double bed?!
# The other private room (priced at 1700) seems to be just overpriced.

# Look more closely at the "Entire home/apt" category: histogram of the
# base-10 log of the price, made interactive with ggplotly().

gg_pricey <-
  ggplot(
    data = listings %>%
      filter(room_type == "Entire home/apt") %>%
      select(price)
  ) +
  geom_histogram(mapping = aes(x = log(price, 10)))

ggplotly(gg_pricey)
# Count the units priced at zero (there are 4):
sum(listings[["price"]] == 0)
## [1] 4
# Show name, room type and price of every unit priced at zero.
listings %>%
  select(name, room_type, price) %>%
  filter(price == 0)
## # A tibble: 4 x 3
##   name                          room_type  price
##   <chr>                         <chr>      <dbl>
## 1 The Revolution Hotel          Hotel room     0
## 2 Inn @ St. Botolph             Hotel room     0
## 3 Boston Fenway Inn             Hotel room     0
## 4 citizenM Boston North Station Hotel room     0
# The zero entries are probably just wrong.

# To deal with those, remove every listing with price == 0 or price > 1000.
# In addition, store the base-10 log of the price (log_price), which works
# better with the default colorscale than the raw price.
boston_airbnb <-
  listings %>%
  filter(price > 0, price <= 1000) %>%
  mutate(log_price = log(price, 10))


# Create a plot that demonstrates the effect of neighborhood on price.

# Keep the listings with 0 < price <= 1000 and add the base-10 log of the
# price as `log_price`.
boston_airbnb <-
  listings %>%
  filter(price > 0, price <= 1000) %>%
  mutate(log_price = log(price, base = 10))

# Neighbourhood names ordered from the lowest to the highest median price.
lvls <-
  boston_airbnb %>%
  group_by(neighbourhood) %>%
  summarise(median_price = median(price)) %>%
  arrange(median_price) %>%
  pull(neighbourhood)

# Box plot of price per neighbourhood, with the neighbourhoods ordered by
# `lvls`. The y axis is drawn on a log scale but its ticks are still prices in
# their original units, so it is titled "price (log scale)" instead of
# "log(price)".
p0 <-
  plot_ly(
    data = boston_airbnb,
    x = ~factor(neighbourhood, lvls),
    y = ~price,
    type = "box",
    name = "",
    showlegend = FALSE
  ) %>%
  layout(
    xaxis = list(title = "", tickangle = -35),
    yaxis = list(type = "log", title = "price (log scale)")
  )
p0
# Put the price data on a Mapbox layer: one marker per listing, coloured by
# the base-10 log of its price, with a hover label built from the listing's
# name, room type, price and minimum nights.

p <-
  plot_mapbox(boston_airbnb) %>%
  add_markers(
    x = ~longitude,
    y = ~latitude,
    color = ~log(price, 10),
    name = "Log (base 10) of price",
    hoverinfo = "text",
    text = ~paste(
      name,
      "\nRoom type:", room_type,
      "\nPrice: ", price,
      "\nMinimum nights: ", minimum_nights
    )
  ) %>%
  layout(
    mapbox = list(
      center = list(lat = 42.32, lon = -71.1),
      zoom = 9.5,
      style = "dark"
    )
  )

p
# Read the MBTA stations and lines from a GPX file
library(tmaptools)
# Read the MBTA GPX file, then keep only the waypoints (stations) and tracks
# (lines) that mention the Red, Green, Blue or Orange Line.
mbta <- read_GPX("mbta.gpx")

stations <- filter(
  mbta$waypoints,
  grepl("Red Line|Green Line|Blue Line|Orange Line", type)
)

T_lines <- filter(
  mbta$tracks,
  grepl("Red Line|Green Line|Blue Line|Orange Line", name)
)

# Preview: overlay the stations and the lines on the price map, each with its
# name as hover text.
p %>%
  add_sf(
    data = stations,
    name = "MBTA T stations",
    text = ~name,
    hoverinfo = "text",
    inherit = FALSE
  ) %>%
  add_sf(
    data = T_lines,
    name = "MBTA T lines",
    text = ~name,
    hoverinfo = "text"
  )
# Add one MBTA line to a plotly map, drawn in the colour it is named after.
#
# p           A plotly object to add the line trace to.
# line_color  A single string such as "red". It is used both as a
#             case-insensitive pattern matched against the track names and
#             as the literal colour of the trace.
# tracks      Track data with a `name` column; defaults to the `T_lines`
#             object defined in this script.
#
# Returns `p` with one more add_sf() trace named "<line_color> line".
add_MBTA_line <- function(p, line_color, tracks = T_lines) {
  # grepl() would only use the first element of a longer vector; fail early.
  stopifnot(is.character(line_color), length(line_color) == 1L)

  p %>%
    add_sf(
      data = tracks %>% filter(grepl(line_color, name, ignore.case = TRUE)),
      color = ~I(line_color),
      text = ~name,
      hoverinfo = "text",
      name = paste0(line_color, " line")
    )
}

# Price map plus the stations (pink markers labelled "name (type)") and one
# trace per MBTA line, added left to right with add_MBTA_line().
p1 <- Reduce(
  f = add_MBTA_line,
  x = c("red", "green", "orange", "blue"),
  init = p %>%
    add_sf(
      data = stations,
      name = "MBTA T stations",
      text = ~paste0(name, " (", type, ")"),
      hoverinfo = "text",
      color = I("pink"),
      size = I(30)
    )
)

colorbar(p1, title = "log(price)")
# Read the neighbourhood polygons from the KML file.
boston_neighborhoods <- sf::st_read(dsn = "Boston_Neighborhoods.kml")
## Reading layer `Boston_Neighborhoods' from data source 
##   `C:\Users\jacob\OneDrive\Desktop\UMass Stat\STAT697V\HW\HW8\Boston_Neighborhoods.kml' 
##   using driver `KML'
## Simple feature collection with 26 features and 2 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: -71.19125 ymin: 42.22792 xmax: -70.92278 ymax: 42.39699
## Geodetic CRS:  WGS 84
# Add the neighbourhood boundaries to the map, with each neighbourhood's Name
# as hover text, and title the colorbar.
# NOTE(review): `fill = ""` is presumably there to draw the polygons without
# filling them -- confirm whether "none" is the intended value.

p2 <-
  p1 %>%
  add_sf(
    data = boston_neighborhoods,
    name = "Neighborhoods Boundaries",
    text = ~Name,
    hoverinfo = "text",
    fill = "",
    inherit = FALSE
  ) %>%
  colorbar(title = "log(price)")
  • Final Version of Boston Airbnb Visualization
# Final figure: neighbourhood box plots on top (20% of the height) and the
# map underneath (80%).
subplot(
  p0, p2,
  nrows = 2,
  heights = c(0.2, 0.8),
  margin = 0.1
)